home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power Programmierung
/
Power-Programmierung (Tewi)(1994).iso
/
qtawk
/
slike.exp
< prev
next >
Wrap
Text File
|
1990-04-23
|
4KB
|
139 lines
# QTAwk program to find words that "sound like" pre-determined words
#
# Start-up: choose the reference words and compute each one's Soundex code
# once, so the per-record action only has to compare codes.
BEGIN {
    # define words to find sound alikes for here
    #########################
    # Each reference word is followed directly by its pre-computed code.
    pattrn = "pattern";
    pattrns = soundex(pattrn);

    file = "file";
    files = soundex(file);

    example = "example";
    examples = soundex(example);
}
# Print the name of the input file as a heading. INITIAL is the QTAwk
# pattern whose action runs when a new input file is started.
INITIAL { print FILENAME; }
# Per-record action: print the record number, then report every field whose
# Soundex code equals the code of one of the reference words set up in BEGIN.
{
    local word;     # Soundex code of the field being examined
    local i;        # field index; declared local so no global "i" is clobbered

    print FNR;
    for ( i = 1 ; i <= NF ; i++ ) {
        word = soundex($i);
        switch ( word ) {   # Find sound alikes
            case pattrns:
                print FNR "R : " $i " sounds like "pattrn;
                break;
            case files:
                print FNR "R : " $i " sounds like "file;
                break;
            case examples:
                print FNR "R : " $i " sounds like "example;
                break;
        }
    }
}
# SOUNDIX Version 1.0
#
# This program takes a character string such as a person's last
# name and translates it to a sound index. This index can then be
# used by an application to perform phonetic (i.e. 'sounds-like')
# search. Algorithm found in D. Knuth, "Art of Computer Programming",
# Vol. 3, Page 391-392
#
# Rules:
# =====
#
# 1) Retain the first letter of the name and drop all occurrences of
# a, e, h, i, o, u, w, and y in other positions
# 2) assign the following numbers to the remaining letters after the first
# bfpv ==> 1
# cgjkqsxz ==> 2
# dt ==> 3
# l ==> 4
# mn ==> 5
# r ==> 6
#
# 3) if two or more letters with the same code were adjacent in the original
# string (before step 1), omit all but the first
#
# 4) convert to the form "letter, digit, digit, digit" by adding trailing
# zeros (if there are fewer than three digits) or by dropping rightmost
# digits (if there are more than three).
#
#
# Logic:
# =====
#
# 1) Uppercase the string
# 2) Set the first letter aside; it becomes the first character of the result
# 3) Change the following letters:
# R to 6
# M,N to 5
# L to 4
# D,T to 3
# C,G,J,K,Q,S,X,Z to 2
# B,F,P,V to 1
# AEIOUYHW to 0
# anything else to 0
# 4) Remove all adjacent duplicates
# 5) Remove all zeros
#
# Example: ( name --> letter codes --> adjacent duplicates and interior zeros deleted --> result )
# =======
#
# McClowry --> 52240060 --> 5246 --> M246
# McLorey --> 5240600 --> 5246 --> M246
#
# Schiller --> 22004406 --> 246 --> S460
# Shilar --> 200406 --> 246 --> S460
#
# Rosen --> 60205 --> 625 --> R250
# Rozin --> 60205 --> 625 --> R250
#
# Moynihan --> 50050005 --> 555 --> M550
# Monnihan --> 50550005 --> 555 --> M550
#
# Abete --> 01030 --> 013 --> A130
# Abadey --> 010300 --> 013 --> A130
#
#
# soundex(str) -- return the Soundex code of str: the upper-cased first
# character followed by three digits ("letter, digit, digit, digit").
#
#   str      word to encode; upper-cased internally.
#   returns  a 4-character code, or the empty string when str is empty.
#
# NOTE(review): characters that do not appear in t_from (for example "," and
# the double quote) are presumably left untranslated by stran() and so can
# end up inside the code -- confirm, and extend the table if that matters.
function soundex(str) {
    local lead;     # first character of the upper-cased input, restored at the end
    # Translation table: the character at position k of t_from is replaced by
    # the character at position k of t_to (rule 2). Letters that are dropped
    # map to "0"; the digit "7" maps to itself and serves as a placeholder.
    local t_from = "|@#$%:;&*()_-+=![]'{}?/<>.~`^1234567890AEIOUYHWBFPVCGJKQSXZDTLMNR\\";
    local t_to = "000000000000000000000000000000000007000000000001111222222223345560";

    str = strupr(str);
    lead = substr(str,1,1);         # rule 1: remember the first character
    str = stran(str,t_to,t_from);   # rule 2: letters -> digit codes

    # Reserve the leading position whenever the first character was coded "0".
    # Otherwise the zero deletion below would remove it and substr(str,2)
    # would then discard the first real digit. The original reserved only a
    # leading A, E, I, O, U, Y or H, so a leading W was mis-coded
    # ("Word" gave W300 instead of W630).
    gsub(/^0/,"7",str);

    # rule 3: collapse runs of the same code into a single digit. This is done
    # before the zeros are deleted, so equal codes separated by a dropped
    # letter are kept apart.
    gsub(/11+/,"1",str);
    gsub(/22+/,"2",str);
    gsub(/33+/,"3",str);
    gsub(/44+/,"4",str);
    gsub(/55+/,"5",str);
    gsub(/66+/,"6",str);

    gsub(/0+/,"",str);              # rule 1: delete the dropped letters
    str = lead substr(str,2);       # glue leading character back on front

    # rule 4: pad with trailing zeros and cut to four characters. An empty
    # input stays empty, as it did before.
    if ( length(str) > 0 ) {
        str = str "000";
        str = substr(str,1,4);
    }
    return str;
}